library(ggplot2)
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_point()
summary(pf$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 13.00 20.00 28.00 37.28 50.00 113.00
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_point() + xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).
Notes: The bulk of data lies below the 1000 threshold for friend count. Using “jitter”, we can add some noise to each age so we get a clearer picture of the relationship between age and friend count (as age is a continuous variable)
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_jitter(alpha = 1/20) +
xlim(13, 90)
## Warning: Removed 5169 rows containing missing values (geom_point).
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_point(alpha = 1/20) +
xlim(13, 90) +
coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).
ggplot(aes(x = age, y = friend_count), data = pf) +
geom_point(alpha = 1/20, position = position_jitter(h = 0)) +
xlim(13, 90) +
coord_trans(y = "sqrt")
## Warning: Removed 5198 rows containing missing values (geom_point).
Explore the relationship between friends initiated vs age.
ggplot(data = pf, aes(x = age, y = friendships_initiated)) +
geom_point(alpha = 1/10, position = position_jitter(h = 0)) +
xlim(13, 90) +
coord_trans(y = "sqrt")
## Warning: Removed 5179 rows containing missing values (geom_point).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(age_groups,
friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n())
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)
## # A tibble: 6 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
Create your plot!
ggplot(data = pf.fc_by_age, aes(age, friend_count_mean)) +
geom_line()
ggplot(aes(x = age, y = friend_count), data = pf) +
coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
geom_point(alpha = 0.05,
position = position_jitter(h = 0),
color = "orange") +
geom_line(stat = "summary", fun.y = mean) +
geom_line(stat = "summary", fun.y = quantile,
fun.args = list(probs = .1),
linetype = 2, color = "blue") +
geom_line(stat = "summary", fun.y = quantile,
fun.args = list(probs = .5),
color = "blue") +
geom_line(stat = "summary", fun.y = quantile,
fun.args = list(probs = .9),
linetype = 2, color = "blue")
cor.test(pf$age, pf$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
What’s the correlation between age and friend count? Round to three decimal places.
with(subset(pf, age <= 70), cor.test(age, friend_count))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
Pearson, Kendall, Spearman
ggplot(data = pf, aes(www_likes_received, likes_received)) +
geom_point()
ggplot(data = pf, aes(www_likes_received, likes_received)) +
geom_point() +
xlim(0, quantile(pf$www_likes_received, 0.95)) +
ylim(0, quantile(pf$likes_received, 0.95)) +
geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
What’s the correlation betwen the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.
with(pf, cor.test(www_likes_received, likes_received))
##
## Pearson's product-moment correlation
##
## data: www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
# install.packages('alr3')
library(alr3)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
data(Mitchell)
?Mitchell
head(Mitchell)
## Month Temp
## 1 0 -5.18333
## 2 1 -1.65000
## 3 2 2.49444
## 4 3 10.40000
## 5 4 14.99440
## 6 5 21.71670
Create your plot!
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
geom_point()
with(Mitchell, cor.test(Month, Temp))
##
## Pearson's product-moment correlation
##
## data: Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
Seems like a pretty weak correlation.
range(Mitchell$Month)
## [1] 0 203
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
geom_point() +
scale_x_continuous(breaks = seq(0, 203, 12))
What do you notice?
Watch the solution video and check out the Instructor Notes!
ggplot(aes(x=(Month%%12),y=Temp),data=Mitchell)+
geom_point()
# install.packages("energy")
library(energy)
with(Mitchell, dcor.ttest(Month, Temp))
##
## dcor t-test of independence
##
## data: Month and Temp
## T = -0.93904, df = 20501, p-value = 0.8261
## sample estimates:
## Bias corrected dcor
## -0.006558215
ggplot(data = pf.fc_by_age, aes(age, friend_count_mean)) +
geom_line()
head(pf.fc_by_age, 10)
## # A tibble: 10 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
pf.fc_by_age[17:19, ]
## # A tibble: 3 x 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 29 120.8182 66.0 1936
## 2 30 115.2080 67.5 1716
## 3 31 118.4599 63.0 1694
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)
# Alternative solution
pf$age_with_months <- with(pf, age + (1 - dob_month / 12))
pf.fc_by_age_months <- pf %>%
group_by(age_with_months) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n()) %>%
arrange(age_with_months)
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
## age_with_months friend_count_mean friend_count_median n
## <dbl> <dbl> <dbl> <int>
## 1 13.16667 46.33333 30.5 6
## 2 13.25000 115.07143 23.5 14
## 3 13.33333 136.20000 44.0 25
## 4 13.41667 164.24242 72.0 33
## 5 13.50000 131.17778 66.0 45
## 6 13.58333 156.81481 64.0 54
ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
aes(age_with_months, friend_count_mean)) +
geom_line()
p1 <- ggplot(aes(x = age, y = friend_count_mean),
data = subset(pf.fc_by_age, age < 71)) +
geom_line() +
geom_smooth()
p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
data = subset(pf.fc_by_age_months, age_with_months < 71)) +
geom_line() +
geom_smooth()
p3 <- ggplot(aes(x = round(age / 5) * 5, y = friend_count),
data = subset(pf, age < 71)) +
geom_line(stat = "summary", fun.y = mean)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
grid.arrange(p1, p2, p3, ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
You don’t have to choose. In exploratory data analysis, we often create multiple visualisations and summaries of the same data. Each graph or summary may reveal different things about the same data.
Reflection
data("diamonds")
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
ggplot(data = diamonds, aes(x = x, y = price)) +
geom_point()
with(diamonds, cor.test(price, x))
##
## Pearson's product-moment correlation
##
## data: price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8825835 0.8862594
## sample estimates:
## cor
## 0.8844352
with(diamonds, cor.test(price, y))
##
## Pearson's product-moment correlation
##
## data: price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8632867 0.8675241
## sample estimates:
## cor
## 0.8654209
with(diamonds, cor.test(price, z))
##
## Pearson's product-moment correlation
##
## data: price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8590541 0.8634131
## sample estimates:
## cor
## 0.8612494
ggplot(data = diamonds, aes(x = x, y = depth)) +
geom_point()
range(diamonds$x)
## [1] 0.00 10.74
ggplot(data = diamonds, aes(x = x, y = depth)) +
geom_point(alpha = 1/100) +
scale_x_continuous(breaks=seq(0,11,2))
with(diamonds, cor.test(price, depth))
##
## Pearson's product-moment correlation
##
## data: price and depth
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.019084756 -0.002208537
## sample estimates:
## cor
## -0.0106474
ggplot(data = subset(diamonds, diamonds$price < quantile(diamonds$price, 0.99) &
diamonds$carat < quantile(diamonds$carat, 0.99)),
aes(x = price, y = carat)) +
geom_point(alpha = 1/20)
diamonds$volume <- diamonds$x * diamonds$y * diamonds$z
ggplot(data = diamonds, aes(x = price, y = volume)) +
geom_point()
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
count(diamonds$volume == 0)
## x freq
## 1 FALSE 53920
## 2 TRUE 20
Exclude diamonds that have a volume greater than or equal to 800 and diamonds that have a volume of 0.
with(subset(diamonds, !(volume == 0 | volume >= 800)), cor.test(price, volume))
##
## Pearson's product-moment correlation
##
## data: price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9222944 0.9247772
## sample estimates:
## cor
## 0.9235455
# Or
with(subset(diamonds, volume > 0 & volume < 800), cor.test(price, volume))
##
## Pearson's product-moment correlation
##
## data: price and volume
## t = 559.19, df = 53915, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9222944 0.9247772
## sample estimates:
## cor
## 0.9235455
Subset the data to exclude diamonds with a volume greater than or equal to 800 and diamonds with a volume of 0. Adjust the transparency of the points and add a linear model to the plot.
diamonds_0_800 <- subset(diamonds, volume > 0 & volume < 800)
ggplot(data = diamonds_0_800, aes(x = price, y = volume)) +
geom_point(alpha = 1/10, color = "yellow") +
geom_smooth(method = "lm", formula = y ~ x, size = 0.5) +
geom_smooth(method = "lm", formula = y ~ poly(x,2), size = 1, color = "green") +
geom_smooth(method = "lm", formula = y ~ poly(x,3), size = 1, color = "pink") +
geom_smooth(method = "gam", formula = y ~ s(x), size = 1, color = "violet") +
geom_smooth(method = "gam", formula = y ~ s(x, k = 3), size = .5, color = "red") +
coord_cartesian(ylim = c(0, 400))
Note: If you used the count() function from the plyr package before this exercise. You need to run this command to unload the plyr package.
detach("package:plyr", unload=TRUE)
## Warning: 'plyr' namespace cannot be unloaded:
## namespace 'plyr' is imported by 'scales', 'ggplot2' so cannot be unloaded
diamondsByClarity <- diamonds %>%
group_by(clarity) %>%
summarise(mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
n = n()) %>%
arrange(clarity)
head(diamondsByClarity)
## # A tibble: 6 x 6
## clarity mean_price median_price min_price max_price n
## <ord> <dbl> <dbl> <int> <int> <int>
## 1 I1 3924.169 3344 345 18531 741
## 2 SI2 5063.029 4072 326 18804 9194
## 3 SI1 3996.001 2822 326 18818 13065
## 4 VS2 3924.989 2054 334 18823 12258
## 5 VS1 3839.455 2005 327 18795 8171
## 6 VVS2 3283.737 1311 336 18768 5066
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
Notes: There are two types of bar charts, determined by what is mapped to bar height. By default, geom_bar uses stat=“count” which makes the height of the bar proportion to the number of cases in each group (or if the weight aesthetic is supplied, the sum of the weights). If you want the heights of the bars to represent values in the data, use stat=“identity” and map a variable to the y aesthetic.
p1 <- ggplot(aes(clarity, mean_price), data = diamondsByClarity) +
geom_bar(stat = 'identity')
p2 <- ggplot(aes(color, mean_price), data = diamonds_mp_by_color) +
geom_bar(stat = 'identity')
grid.arrange(p1, p2, ncol = 1)